import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the billionaires dataset from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='Billionaires Statistics Dataset.csv')
# Preview the first five rows.
data.head(n=5)
C:\Users\Anjal\anaconda3\lib\site-packages\scipy\__init__.py:155: UserWarning: A NumPy version >=1.18.5 and <1.25.0 is required for this version of SciPy (detected version 1.26.2
warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
| rank | finalWorth | category | personName | age | country | city | source | industries | countryOfCitizenship | ... | cpi_change_country | gdp_country | gross_tertiary_education_enrollment | gross_primary_education_enrollment_country | life_expectancy_country | tax_revenue_country_country | total_tax_rate_country | population_country | latitude_country | longitude_country | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 211000 | Fashion & Retail | Bernard Arnault & family | 74.0 | France | Paris | LVMH | Fashion & Retail | France | ... | 1.1 | $2,715,518,274,227 | 65.6 | 102.5 | 82.5 | 24.2 | 60.7 | 67059887.0 | 46.227638 | 2.213749 |
| 1 | 2 | 180000 | Automotive | Elon Musk | 51.0 | United States | Austin | Tesla, SpaceX | Automotive | United States | ... | 7.5 | $21,427,700,000,000 | 88.2 | 101.8 | 78.5 | 9.6 | 36.6 | 328239523.0 | 37.090240 | -95.712891 |
| 2 | 3 | 114000 | Technology | Jeff Bezos | 59.0 | United States | Medina | Amazon | Technology | United States | ... | 7.5 | $21,427,700,000,000 | 88.2 | 101.8 | 78.5 | 9.6 | 36.6 | 328239523.0 | 37.090240 | -95.712891 |
| 3 | 4 | 107000 | Technology | Larry Ellison | 78.0 | United States | Lanai | Oracle | Technology | United States | ... | 7.5 | $21,427,700,000,000 | 88.2 | 101.8 | 78.5 | 9.6 | 36.6 | 328239523.0 | 37.090240 | -95.712891 |
| 4 | 5 | 106000 | Finance & Investments | Warren Buffett | 92.0 | United States | Omaha | Berkshire Hathaway | Finance & Investments | United States | ... | 7.5 | $21,427,700,000,000 | 88.2 | 101.8 | 78.5 | 9.6 | 36.6 | 328239523.0 | 37.090240 | -95.712891 |
5 rows × 35 columns
# Print the column names, non-null counts, dtypes and memory usage of `data`.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2640 entries, 0 to 2639 Data columns (total 35 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 rank 2640 non-null int64 1 finalWorth 2640 non-null int64 2 category 2640 non-null object 3 personName 2640 non-null object 4 age 2640 non-null float64 5 country 2640 non-null object 6 city 2568 non-null object 7 source 2640 non-null object 8 industries 2640 non-null object 9 countryOfCitizenship 2640 non-null object 10 organization 325 non-null object 11 selfMade 2640 non-null bool 12 status 2640 non-null object 13 gender 2640 non-null object 14 birthDate 2564 non-null object 15 lastName 2640 non-null object 16 firstName 2637 non-null object 17 title 339 non-null object 18 date 2640 non-null object 19 state 753 non-null object 20 residenceStateRegion 747 non-null object 21 birthYear 2564 non-null float64 22 birthMonth 2564 non-null float64 23 birthDay 2564 non-null float64 24 cpi_country 2456 non-null float64 25 cpi_change_country 2456 non-null float64 26 gdp_country 2476 non-null object 27 gross_tertiary_education_enrollment 2458 non-null float64 28 gross_primary_education_enrollment_country 2459 non-null float64 29 life_expectancy_country 2458 non-null float64 30 tax_revenue_country_country 2457 non-null float64 31 total_tax_rate_country 2458 non-null float64 32 population_country 2476 non-null float64 33 latitude_country 2476 non-null float64 34 longitude_country 2476 non-null float64 dtypes: bool(1), float64(14), int64(2), object(18) memory usage: 704.0+ KB
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()
| rank | finalWorth | age | birthYear | birthMonth | birthDay | cpi_country | cpi_change_country | gross_tertiary_education_enrollment | gross_primary_education_enrollment_country | life_expectancy_country | tax_revenue_country_country | total_tax_rate_country | population_country | latitude_country | longitude_country | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2640.000000 | 2640.000000 | 2575.000000 | 2564.000000 | 2564.000000 | 2564.000000 | 2456.000000 | 2456.000000 | 2458.000000 | 2459.000000 | 2458.000000 | 2457.000000 | 2458.000000 | 2.476000e+03 | 2476.000000 | 2476.000000 |
| mean | 1289.159091 | 4623.787879 | 65.140194 | 1957.183307 | 5.740250 | 12.099844 | 127.755204 | 4.364169 | 67.225671 | 102.858520 | 78.122823 | 12.546235 | 43.963344 | 5.102053e+08 | 34.903592 | 12.583156 |
| std | 739.693726 | 9834.240939 | 13.258098 | 13.282516 | 3.710085 | 9.918876 | 26.452951 | 3.623763 | 21.343426 | 4.710977 | 3.730099 | 5.368625 | 12.145296 | 5.542447e+08 | 17.003497 | 86.762989 |
| min | 1.000000 | 1000.000000 | 18.000000 | 1921.000000 | 1.000000 | 1.000000 | 99.550000 | -1.900000 | 4.000000 | 84.700000 | 54.300000 | 0.100000 | 9.900000 | 3.801900e+04 | -40.900557 | -106.346771 |
| 25% | 659.000000 | 1500.000000 | 56.000000 | 1948.000000 | 2.000000 | 1.000000 | 117.240000 | 1.700000 | 50.600000 | 100.200000 | 77.000000 | 9.600000 | 36.600000 | 6.683440e+07 | 35.861660 | -95.712891 |
| 50% | 1312.000000 | 2300.000000 | 65.000000 | 1957.000000 | 6.000000 | 11.000000 | 117.240000 | 2.900000 | 65.600000 | 101.800000 | 78.500000 | 9.600000 | 41.200000 | 3.282395e+08 | 37.090240 | 10.451526 |
| 75% | 1905.000000 | 4200.000000 | 75.000000 | 1966.000000 | 9.000000 | 21.000000 | 125.080000 | 7.500000 | 88.200000 | 102.600000 | 80.900000 | 12.800000 | 59.100000 | 1.366418e+09 | 40.463667 | 104.195397 |
| max | 2540.000000 | 211000.000000 | 101.000000 | 2004.000000 | 12.000000 | 31.000000 | 288.570000 | 53.500000 | 136.600000 | 142.100000 | 84.200000 | 37.200000 | 106.300000 | 1.397715e+09 | 61.924110 | 174.885971 |
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
# Heatmap of missing values in `data`: one heatmap column per DataFrame
# column, coloured by whether each cell is null. Row labels are hidden.
null_mask = data.isnull()
plt.figure(figsize=(12, 8))
ax = sns.heatmap(null_mask, cmap='viridis', cbar=False, yticklabels=False)
ax.set_title('Visualization of Null Values in the Dataset')
plt.show()
# Impute missing ages with the median age, then count the nulls that remain.
median_age = data['age'].median()
data['age'] = data['age'].fillna(median_age)
data['age'].isna().sum()
0
# Replace missing country values with the placeholder 'unknown', then count
# the nulls that remain.
data['country'] = data['country'].fillna(value='unknown')
data['country'].isna().sum()
0
# Strip the literal dollar sign from the GDP strings.
# regex=False is passed explicitly: as a regular expression '$' is the
# end-of-string anchor, and relying on the default `regex` value raises a
# pandas FutureWarning because that default is changing.
data['gdp_country'] = data['gdp_country'].str.replace('$', '', regex=False)
# The values are still strings with thousands separators at this point.
data['gdp_country'].head()
C:\Users\Anjal\AppData\Local\Temp\ipykernel_17492\3207136862.py:1: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
data.gdp_country=data.gdp_country.str.replace('$','')
0 2,715,518,274,227 1 21,427,700,000,000 2 21,427,700,000,000 3 21,427,700,000,000 4 21,427,700,000,000 Name: gdp_country, dtype: object
# Wealth distribution analysis: how billionaires are spread across
# industries, countries, and regions.
# Start with the ten most frequent values of the 'category' column.
top_industry = data['category'].value_counts().head(10)
plt.figure(figsize=(20, 6))
ax = sns.barplot(x=top_industry.index, y=top_industry)
ax.set_title("Top Ten Bilionaires by Industry", fontsize=14)
ax.set_xlabel("Industry", fontsize=12)
ax.set_ylabel("Amount of Billionaires", fontsize=12)
Text(0, 0.5, 'Amount of Billionaires')
# Ten most frequent values of the 'country' column with their row counts.
country_counts = data['country'].value_counts()
top_countries = country_counts.head(10)
top_countries
United States 754 China 523 India 157 Germany 102 United Kingdom 82 Russia 79 Switzerland 78 Hong Kong 68 Italy 55 Singapore 46 Name: country, dtype: int64
# Bar chart of the ten countries with the most billionaires
# (`top_countries` holds the per-country row counts).
plt.figure(figsize=(15, 6))
sns.barplot(x=top_countries.index, y=top_countries)
# Title corrected: this chart is per country, not per industry.
plt.title("Top Ten Countries by Number of Billionaires", fontsize=14)
plt.xlabel("Country", fontsize=12)
plt.ylabel("Number of Billionaires", fontsize=12)
Text(0, 0.5, 'Amount of Billionaires')
# Ten most frequent values of the 'city' column with their row counts.
city_counts = data['city'].value_counts()
top_cities = city_counts.head(10)
top_cities
New York 99 Beijing 68 Hong Kong 68 Shanghai 64 London 61 Moscow 60 Mumbai 56 Shenzhen 54 Singapore 45 Delhi 37 Name: city, dtype: int64
# Bar chart of the ten cities with the most billionaires
# (`top_cities` holds the per-city row counts).
plt.figure(figsize=(15, 6))
sns.barplot(x=top_cities.index, y=top_cities)
# Title corrected: the bars are cities ranked by billionaire count.
plt.title("Top Ten Cities by Number of Billionaires", fontsize=14)
plt.xlabel("City", fontsize=12)
plt.ylabel("Number of Billionaires", fontsize=12)
Text(0, 0.5, 'Amount of Billionaires')
# Demographics: age, gender, and birthplace of billionaires.
# Age distribution as a 30-bin histogram with a KDE curve overlaid.
plt.figure(figsize=(20, 6))
sns.histplot(data['age'], bins=30, kde=True)
<AxesSubplot:xlabel='age', ylabel='Count'>
# Gender breakdown: one bar per value of the 'gender' column, with the bar
# height giving the number of rows. (Previously the plotting call was
# commented out, so this cell only produced an empty figure.)
plt.figure(figsize=(20, 6))
sns.countplot(data=data, x='gender')
<Figure size 2000x600 with 0 Axes>
<Figure size 2000x600 with 0 Axes>
# Ten most frequent countries of citizenship, drawn as a bar chart.
citizenship_counts = data['countryOfCitizenship'].value_counts()
coc = citizenship_counts.head(10)
plt.figure(figsize=(20, 6))
sns.barplot(y=coc, x=coc.index)
<AxesSubplot:ylabel='countryOfCitizenship'>
# Self-made versus inherited wealth: peek at the boolean 'selfMade' flag.
data['selfMade'].head()
0 False 1 True 2 True 3 True 4 True Name: selfMade, dtype: bool
# Number of rows for each value (True / False) of the 'selfMade' flag.
self_made = data.selfMade.value_counts()
self_made
True 1812 False 828 Name: selfMade, dtype: int64
# Bar chart of 'finalWorth' for the first ten rows of the dataset, with the
# row index on the x-axis.
fw = data.finalWorth.iloc[:10]
plt.figure(figsize=(20, 6))
sns.barplot(x=fw.index, y=fw, data=data, color='red')
<AxesSubplot:ylabel='finalWorth'>
# Show the 'finalWorth' values of the first ten rows.
data['finalWorth'].head(10)
0 211000 1 180000 2 114000 3 107000 4 106000 5 104000 6 94500 7 93000 8 83400 9 80700 Name: finalWorth, dtype: int64
import plotly.express as px

# 3-D scatter: age against final worth, with category on the z-axis and as
# the colour.
fig = px.scatter_3d(
    data_frame=data,
    x='age',
    y='finalWorth',
    z='category',
    color='category',
)
fig.update_layout(title_text='Final Worth vs. Age and Category')
fig.show()

# 2-D scatter of the same variables; marker size scales with final worth.
fig = px.scatter(
    data_frame=data,
    x='age',
    y='finalWorth',
    size='finalWorth',
    color='category',
)
fig.update_layout(title_text='Final Worth vs. Age')
fig.show()
# Data transformation of the top 50 ranked billionaires.
# The original cell was R (data.frame / leaflet / glue) code, which raises a
# SyntaxError in a Python kernel; this is the pandas + plotly equivalent.
import plotly.express as px

loca_top50 = (
    data[['rank', 'personName', 'finalWorth', 'country',
          'latitude_country', 'longitude_country']]
    .head(50)
    # Rank is a label here, not a quantity, so keep it as text.
    .assign(rank=lambda frame: frame['rank'].astype(str))
    .rename(columns={
        'personName': 'Name',
        'finalWorth': 'Assets',
        'country': 'Residence_Country',
        'latitude_country': 'lat',
        'longitude_country': 'lng',
    })
)

# Plot the markers on a world map. Rows without coordinates cannot be
# placed and are left out. The coordinates are per country, so people from
# the same country share a marker position.
top50_map = px.scatter_geo(
    loca_top50.dropna(subset=['lat', 'lng']),
    lat='lat',
    lon='lng',
    hover_name='Name',
    hover_data={
        'rank': True,
        'Assets': ':,',  # thousands separators, as comma() did in the R version
        'Residence_Country': True,
        'lat': False,
        'lng': False,
    },
    title='Top 50 Ranked Billionaires by Country of Residence',
)
top50_map.show()
File "C:\Users\Anjal\AppData\Local\Temp\ipykernel_17492\81726786.py", line 2 loca_top50 <- data.frame(rank = as.character(df_billion$rank), ^ SyntaxError: invalid syntax
# Extract the year from the 'date' column. The column is loaded from the CSV
# as plain strings (e.g. '4/4/2023 5:01'), so it has to be parsed to
# datetimes before the .dt accessor can be used.
data['year'] = pd.to_datetime(data['date']).dt.year
# Group by year and calculate the mean wealth
wealth_distribution_over_time = data.groupby('year')['finalWorth'].mean().reset_index()
# Plotting the trend over time
# NOTE(review): the rows inspected so far all carry the same date, so this
# may collapse to a single point - confirm the dataset spans several years.
plt.figure(figsize=(12, 6))
sns.lineplot(x='year', y='finalWorth', data=wealth_distribution_over_time, marker='o')
plt.title('Wealth Distribution Trend Over Time')
plt.xlabel('Year')
plt.ylabel('Average Wealth (in billions)')
plt.grid(True)
plt.show()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) ~\AppData\Local\Temp\ipykernel_17492\3249720849.py in <module> 1 # Extract year from the 'date' column ----> 2 data['year'] = data['date'].dt.year 3 4 # Group by year and calculate the mean wealth 5 wealth_distribution_over_time = data.groupby('year')['finalWorth'].mean().reset_index() ~\anaconda3\lib\site-packages\pandas\core\generic.py in __getattr__(self, name) 5900 ): 5901 return self[name] -> 5902 return object.__getattribute__(self, name) 5903 5904 def __setattr__(self, name: str, value) -> None: ~\anaconda3\lib\site-packages\pandas\core\accessor.py in __get__(self, obj, cls) 180 # we're accessing the attribute of the class, i.e., Dataset.geo 181 return self._accessor --> 182 accessor_obj = self._accessor(obj) 183 # Replace the property with the accessor object. Inspired by: 184 # https://www.pydanny.com/cached-property.html ~\anaconda3\lib\site-packages\pandas\core\indexes\accessors.py in __new__(cls, data) 510 return PeriodProperties(data, orig) 511 --> 512 raise AttributeError("Can only use .dt accessor with datetimelike values") AttributeError: Can only use .dt accessor with datetimelike values
# Display the raw 'date' column to inspect its values and dtype.
data['date']
0 4/4/2023 5:01
1 4/4/2023 5:01
2 4/4/2023 5:01
3 4/4/2023 5:01
4 4/4/2023 5:01
...
2635 4/4/2023 5:01
2636 4/4/2023 5:01
2637 4/4/2023 5:01
2638 4/4/2023 5:01
2639 4/4/2023 5:01
Name: date, Length: 2640, dtype: object
!geopandas
Collecting geopandas
Downloading geopandas-0.14.1-py3-none-any.whl (1.1 MB)
---------------------------------------- 1.1/1.1 MB 8.8 MB/s eta 0:00:00
Requirement already satisfied: pandas>=1.4.0 in c:\users\anjal\anaconda3\lib\site-packages (from geopandas) (1.5.3)
Collecting shapely>=1.8.0
Downloading shapely-2.0.2-cp39-cp39-win_amd64.whl (1.4 MB)
---------------------------------------- 1.4/1.4 MB 8.3 MB/s eta 0:00:00
Collecting fiona>=1.8.21
Downloading fiona-1.9.5-cp39-cp39-win_amd64.whl (22.9 MB)
--------------------------------------- 22.9/22.9 MB 17.2 MB/s eta 0:00:00
Requirement already satisfied: packaging in c:\users\anjal\anaconda3\lib\site-packages (from geopandas) (21.3)
Collecting pyproj>=3.3.0
Downloading pyproj-3.6.1-cp39-cp39-win_amd64.whl (6.1 MB)
---------------------------------------- 6.1/6.1 MB 32.5 MB/s eta 0:00:00
Requirement already satisfied: click~=8.0 in c:\users\anjal\anaconda3\lib\site-packages (from fiona>=1.8.21->geopandas) (8.0.4)
Requirement already satisfied: six in c:\users\anjal\anaconda3\lib\site-packages (from fiona>=1.8.21->geopandas) (1.16.0)
Requirement already satisfied: importlib-metadata in c:\users\anjal\anaconda3\lib\site-packages (from fiona>=1.8.21->geopandas) (4.11.3)
Collecting click-plugins>=1.0
Downloading click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Requirement already satisfied: certifi in c:\users\anjal\anaconda3\lib\site-packages (from fiona>=1.8.21->geopandas) (2023.7.22)
Requirement already satisfied: setuptools in c:\users\anjal\anaconda3\lib\site-packages (from fiona>=1.8.21->geopandas) (63.4.1)
Collecting cligj>=0.5
Downloading cligj-0.7.2-py3-none-any.whl (7.1 kB)
Requirement already satisfied: attrs>=19.2.0 in c:\users\anjal\anaconda3\lib\site-packages (from fiona>=1.8.21->geopandas) (21.4.0)
Requirement already satisfied: numpy>=1.20.3 in c:\users\anjal\anaconda3\lib\site-packages (from pandas>=1.4.0->geopandas) (1.26.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\anjal\anaconda3\lib\site-packages (from pandas>=1.4.0->geopandas) (2022.1)
Requirement already satisfied: python-dateutil>=2.8.1 in c:\users\anjal\anaconda3\lib\site-packages (from pandas>=1.4.0->geopandas) (2.8.2)
Requirement already satisfied: pyparsing!=3.0.5,>=2.0.2 in c:\users\anjal\anaconda3\lib\site-packages (from packaging->geopandas) (3.0.9)
Requirement already satisfied: colorama in c:\users\anjal\anaconda3\lib\site-packages (from click~=8.0->fiona>=1.8.21->geopandas) (0.4.5)
Requirement already satisfied: zipp>=0.5 in c:\users\anjal\anaconda3\lib\site-packages (from importlib-metadata->fiona>=1.8.21->geopandas) (3.8.0)
Installing collected packages: shapely, pyproj, cligj, click-plugins, fiona, geopandas
Successfully installed click-plugins-1.1.1 cligj-0.7.2 fiona-1.9.5 geopandas-0.14.1 pyproj-3.6.1 shapely-2.0.2
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
# Plot billionaire locations on a world map using the country-level
# 'latitude_country' / 'longitude_country' columns.
# Rows without coordinates cannot be placed on the map, so drop them before
# building the point geometries.
located = data.dropna(subset=['longitude_country', 'latitude_country'])
geometry = gpd.points_from_xy(located['longitude_country'], located['latitude_country'])
# The columns are assumed to hold longitude/latitude in degrees (WGS84), so
# declare that CRS explicitly.
geo_df = gpd.GeoDataFrame(located, geometry=geometry, crs='EPSG:4326')
# World country outlines from Natural Earth (110m cultural vectors).
# gpd.datasets.get_path('naturalearth_lowres') is deprecated and removed in
# GeoPandas 1.0, so the archive is read directly (requires network access;
# a locally downloaded copy of the same file works as well).
world = gpd.read_file(
    'https://naciscdn.org/naturalearth/110m/cultural/ne_110m_admin_0_countries.zip'
)
# Plotting the map
fig, ax = plt.subplots(figsize=(15, 10))
world.plot(ax=ax, color='lightgrey')
# Plot billionaire locations on top
geo_df.plot(ax=ax, markersize=30, color='red', alpha=0.7)
plt.title('Distribution of Billionaires Worldwide')
plt.show()
C:\Users\Anjal\AppData\Local\Temp\ipykernel_17492\2621660113.py:10: FutureWarning: The geopandas.dataset module is deprecated and will be removed in GeoPandas 1.0. You can get the original 'naturalearth_lowres' data from https://www.naturalearthdata.com/downloads/110m-cultural-vectors/.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
# Visualizing sources of wealth (self-made or not) by region.
# The dataset has no 'region' column; 'residenceStateRegion' is the region
# field it provides. It is populated for only part of the rows, and rows
# with a missing region are simply not counted.
plt.figure(figsize=(15, 5))
sns.countplot(x='residenceStateRegion', hue='selfMade', data=data, palette='viridis')
plt.title('Sources of Wealth for Billionaires by Region')
plt.xlabel('Region')
plt.ylabel('Number of Billionaires')
plt.legend(title='Self-Made', loc='upper right')
plt.show()
# Keep only the rows whose 'country' is among the ten most frequent ones.
country_sizes = data['country'].value_counts()
top_regions = country_sizes.nlargest(10).index
in_top_regions = data['country'].isin(top_regions)
data_top10 = data[in_top_regions]

# Box plot of final worth per country; the y-axis is logarithmic so that
# very large fortunes do not flatten the rest of the distribution.
plt.figure(figsize=(15, 5))
ax = sns.boxplot(data=data_top10, x='country', y='finalWorth', palette='viridis')
ax.set_title('Wealth Distribution of Billionaires in Top 10 Regions')
ax.set_xlabel('Region')
ax.set_ylabel('Final Worth (in billions)')
ax.set_yscale('log')
plt.show()
# Restrict the data to the ten most frequent values of 'industries'.
industry_sizes = data['industries'].value_counts()
top_industries = industry_sizes.nlargest(10).index
in_top_industries = data['industries'].isin(top_industries)
data_top_industries = data[in_top_industries]

# Grouped count plot: one group per industry, one bar per gender.
plt.figure(figsize=(15, 7))
ax = sns.countplot(data=data_top_industries, x='industries', hue='gender', palette='viridis')
ax.set_title('Gender Distribution in Top Industries')
ax.set_xlabel('Industry')
ax.set_ylabel('Number of Billionaires')
# Rotate the industry names so the labels do not overlap.
plt.xticks(rotation=45, ha='right')
# Anchor the legend just outside the right edge of the axes.
plt.legend(title='Gender', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# Predict the 'selfMade' flag from age, gender and industry with logistic
# regression, then report accuracy, a classification report and a
# confusion matrix on a held-out 20% test split.

# Drop rows with missing values in relevant columns
data = data.dropna(subset=['selfMade', 'age', 'gender', 'industries'])
# Select relevant features and target variable
X = data[['age', 'gender', 'industries']]
y = data['selfMade']
# Convert categorical variables to numerical using one-hot encoding;
# drop_first removes one level per variable so the dummies are not
# perfectly collinear.
X = pd.get_dummies(X, columns=['gender', 'industries'], drop_first=True)
# Train-test split (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Logistic Regression Model
# The default iteration limit is too low for lbfgs on these unscaled
# features (it stopped with a ConvergenceWarning), so raise max_iter.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
# Predictions on the test set (hard class labels, not probabilities)
predictions = model.predict(X_test)
# Model Evaluation
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy:.2f}')
# Classification Report
print('Classification Report:')
print(classification_report(y_test, predictions))
# Confusion Matrix
plt.figure(figsize=(8, 6))
cm = confusion_matrix(y_test, predictions)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={"size": 16})
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
Accuracy: 0.74
Classification Report:
precision recall f1-score support
False 0.70 0.30 0.42 166
True 0.75 0.94 0.83 362
accuracy 0.74 528
macro avg 0.73 0.62 0.63 528
weighted avg 0.73 0.74 0.70 528
C:\Users\Anjal\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:814: ConvergenceWarning:
lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.
Increase the number of iterations (max_iter) or scale the data as shown in:
https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
# ROC curve of the fitted model on the test split.
from sklearn.metrics import roc_curve, auc

# Column 1 of predict_proba is the score for the second class in
# model.classes_.
positive_scores = model.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, positive_scores)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8, 6))
roc_label = 'ROC curve (AUC = {:.2f})'.format(roc_auc)
plt.plot(fpr, tpr, color='darkorange', lw=2, label=roc_label)
# Diagonal reference line for a classifier with no discriminative power.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend()
plt.show()
# Histogram of the predicted probabilities on the test split.
# `predictions` holds hard class labels, which is not what this chart is
# titled as and which made the KDE fail; use the model's probability output
# instead (column 1 = second class in model.classes_).
predicted_probabilities = model.predict_proba(X_test)[:, 1]
plt.figure(figsize=(10, 6))
sns.histplot(predicted_probabilities, kde=True, color='skyblue')
plt.title('Distribution of Predicted Probabilities for Being Self-Made')
plt.xlabel('Predicted Probability')
plt.ylabel('Frequency')
plt.show()
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) ~\AppData\Local\Temp\ipykernel_17492\4272087344.py in <module> 1 plt.figure(figsize=(10, 6)) ----> 2 sns.histplot(predictions, kde=True, color='skyblue') 3 plt.title('Distribution of Predicted Probabilities for Being Self-Made') 4 plt.xlabel('Predicted Probability') 5 plt.ylabel('Frequency') ~\anaconda3\lib\site-packages\seaborn\distributions.py in histplot(data, x, y, hue, weights, stat, bins, binwidth, binrange, discrete, cumulative, common_bins, common_norm, multiple, element, fill, shrink, kde, kde_kws, line_kws, thresh, pthresh, pmax, cbar, cbar_ax, cbar_kws, palette, hue_order, hue_norm, color, log_scale, legend, ax, **kwargs) 1460 if p.univariate: 1461 -> 1462 p.plot_univariate_histogram( 1463 multiple=multiple, 1464 element=element, ~\anaconda3\lib\site-packages\seaborn\distributions.py in plot_univariate_histogram(self, multiple, element, fill, common_norm, common_bins, shrink, kde, kde_kws, color, legend, line_kws, estimate_kws, **plot_kws) 416 kde_kws["cumulative"] = estimate_kws["cumulative"] 417 log_scale = self._log_scaled(self.data_variable) --> 418 densities = self._compute_univariate_density( 419 self.data_variable, 420 common_norm, ~\anaconda3\lib\site-packages\seaborn\distributions.py in _compute_univariate_density(self, data_variable, common_norm, common_grid, estimate_kws, log_scale, warn_singular) 324 325 # Estimate the density of observations at this level --> 326 density, support = estimator(observations, weights=weights) 327 328 if log_scale: ~\anaconda3\lib\site-packages\seaborn\_statistics.py in __call__(self, x1, x2, weights) 185 """Fit and evaluate on univariate or bivariate data.""" 186 if x2 is None: --> 187 return self._eval_univariate(x1, weights) 188 else: 189 return self._eval_bivariate(x1, x2, weights) ~\anaconda3\lib\site-packages\seaborn\_statistics.py in _eval_univariate(self, x, weights) 144 support = 
self.support 145 if support is None: --> 146 support = self.define_support(x, cache=False) 147 148 kde = self._fit(x, weights) ~\anaconda3\lib\site-packages\seaborn\_statistics.py in define_support(self, x1, x2, weights, cache) 117 """Create the evaluation grid for a given data set.""" 118 if x2 is None: --> 119 support = self._define_support_univariate(x1, weights) 120 else: 121 support = self._define_support_bivariate(x1, x2, weights) ~\anaconda3\lib\site-packages\seaborn\_statistics.py in _define_support_univariate(self, x, weights) 89 def _define_support_univariate(self, x, weights): 90 """Create a 1D grid of evaluation points.""" ---> 91 kde = self._fit(x, weights) 92 bw = np.sqrt(kde.covariance.squeeze()) 93 grid = self._define_support_grid( ~\anaconda3\lib\site-packages\seaborn\_statistics.py in _fit(self, fit_data, weights) 135 fit_kws["weights"] = weights 136 --> 137 kde = stats.gaussian_kde(fit_data, **fit_kws) 138 kde.set_bandwidth(kde.factor * self.bw_adjust) 139 ~\anaconda3\lib\site-packages\scipy\stats\_kde.py in __init__(self, dataset, bw_method, weights) 205 self._neff = 1/sum(self._weights**2) 206 --> 207 self.set_bandwidth(bw_method=bw_method) 208 209 def evaluate(self, points): ~\anaconda3\lib\site-packages\scipy\stats\_kde.py in set_bandwidth(self, bw_method) 553 raise ValueError(msg) 554 --> 555 self._compute_covariance() 556 557 def _compute_covariance(self): ~\anaconda3\lib\site-packages\scipy\stats\_kde.py in _compute_covariance(self) 565 bias=False, 566 aweights=self.weights)) --> 567 self._data_inv_cov = linalg.inv(self._data_covariance) 568 569 self.covariance = self._data_covariance * self.factor**2 ~\anaconda3\lib\site-packages\scipy\linalg\_basic.py in inv(a, overwrite_a, check_finite) 925 926 """ --> 927 a1 = _asarray_validated(a, check_finite=check_finite) 928 if len(a1.shape) != 2 or a1.shape[0] != a1.shape[1]: 929 raise ValueError('expected square matrix') ~\anaconda3\lib\site-packages\scipy\_lib\_util.py in _asarray_validated(a, 
check_finite, sparse_ok, objects_ok, mask_ok, as_inexact) 288 if not objects_ok: 289 if a.dtype is np.dtype('O'): --> 290 raise ValueError('object arrays are not supported') 291 if as_inexact: 292 if not np.issubdtype(a.dtype, np.inexact): ValueError: object arrays are not supported
# Count plot of the 'selfMade' flag: one bar per value.
plt.figure(figsize=(8, 6))
ax = sns.countplot(data=data, x='selfMade', palette='viridis')
ax.set(
    title='Distribution of Self-Made Status',
    xlabel='Self-Made',
    ylabel='Count',
)
plt.show()